{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading 20news dataset. This may take a few minutes.\n",
      "Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
     ]
    }
   ],
   "source": [
    "dataset = fetch_20newsgroups(shuffle=True, random_state=1,\n",
    "                             remove=('headers', 'footers', 'quotes'))\n",
    "documents = dataset.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "no_features = 1000\n",
    "\n",
    "tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n",
    "                                   max_features=no_features,\n",
    "                                   stop_words='english')\n",
    "tfidf = tfidf_vectorizer.fit_transform(documents)\n",
    "tfidf_feature_names = tfidf_vectorizer.get_feature_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from minisom import MiniSom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "som = MiniSom(2, 4, no_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "D = tfidf.todense().tolist()\n",
    "som.pca_weights_init(D)\n",
    "som.train_batch(D, 40000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic 1 : don like hear good think year game did games team\n",
      "Topic 2 : day know like people religion said law don didn evidence\n",
      "Topic 3 : use like word speed digital mouse need sure know ll\n",
      "Topic 4 : file set use files driver printer problem pc windows mode\n",
      "Topic 5 : good assume people want clinton read just book money like\n",
      "Topic 6 : good want know right like god does don jesus people\n",
      "Topic 7 : don use air cards chips stuff com sun does know\n",
      "Topic 8 : info use new appreciated help scsi drive mail advance thanks\n"
     ]
    }
   ],
   "source": [
    "top_keywords = 10\n",
    "\n",
    "weights = som.get_weights()\n",
    "cnt = 1\n",
    "for i in range(2):\n",
    "    for j in range(4):\n",
    "        keywords_idx = np.argsort(weights[i,j,:])[-top_keywords:]\n",
    "        keywords = ' '.join([tfidf_feature_names[k] for k in keywords_idx])\n",
    "        print('Topic', cnt, ':', keywords)\n",
    "        #plt.text(i, j, tfidf_feature_names[])\n",
    "        #plt.plot(i, j, '.')\n",
    "        cnt += 1"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
